import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os
# os.listdir() returns entries in arbitrary order, so sort them
# (case-insensitively) before slicing; otherwise "the last eight entries"
# is not a reproducible selection.
files = sorted(os.listdir(), key=str.casefold)[-8:]
files
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv', 'Uber.ipynb']
# Drop the two entries that are not 2014 monthly trip files: the 2015
# extract and this notebook itself.
for unwanted in ('uber-raw-data-janjune-15.csv', 'Uber.ipynb'):
    files.remove(unwanted)
files
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
# Read every monthly file first, then concatenate once. Growing `final` with
# pd.concat inside the loop re-copies all previously loaded rows on every
# iteration. Iterating in reverse keeps the original row order, in which each
# newly read file was placed in front of the accumulated frame.
frames = [pd.read_csv(file, encoding='utf-8') for file in reversed(files)]
# pd.concat raises ValueError on an empty list; fall back to a blank frame.
final = pd.concat(frames) if frames else pd.DataFrame()
final.shape
(4534327, 4)
final.head()
| Date/Time | Lat | Lon | Base | |
|---|---|---|---|---|
| 0 | 9/1/2014 0:01:00 | 40.2201 | -74.0021 | B02512 |
| 1 | 9/1/2014 0:01:00 | 40.7500 | -74.0027 | B02512 |
| 2 | 9/1/2014 0:03:00 | 40.7559 | -73.9864 | B02512 |
| 3 | 9/1/2014 0:06:00 | 40.7450 | -73.9889 | B02512 |
| 4 | 9/1/2014 0:11:00 | 40.8145 | -73.9444 | B02512 |
# Work on a copy so the columns added below do not modify `final`.
df = final.copy()
df.dtypes
Date/Time object Lat float64 Lon float64 Base object dtype: object
# Convert the 'Date/Time' column to datetime64 using an explicit layout
# (month/day/year hour:minute:second) rather than letting pandas infer it.
timestamp_layout = "%m/%d/%Y %H:%M:%S"
df['Date/Time'] = pd.to_datetime(df['Date/Time'], format=timestamp_layout)
df.dtypes
Date/Time datetime64[ns] Lat float64 Lon float64 Base object dtype: object
# Derive calendar features from the parsed timestamp. The columns are added
# in the order weekday, day, minute, month, hour.
when = df['Date/Time'].dt
df['weekday'] = when.day_name()
for part in ('day', 'minute', 'month', 'hour'):
    df[part] = getattr(when, part)
df.head()
| Date/Time | Lat | Lon | Base | weekday | day | minute | month | hour | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-09-01 00:01:00 | 40.2201 | -74.0021 | B02512 | Monday | 1 | 1 | 9 | 0 |
| 1 | 2014-09-01 00:01:00 | 40.7500 | -74.0027 | B02512 | Monday | 1 | 1 | 9 | 0 |
| 2 | 2014-09-01 00:03:00 | 40.7559 | -73.9864 | B02512 | Monday | 1 | 3 | 9 | 0 |
| 3 | 2014-09-01 00:06:00 | 40.7450 | -73.9889 | B02512 | Monday | 1 | 6 | 9 | 0 |
| 4 | 2014-09-01 00:11:00 | 40.8145 | -73.9444 | B02512 | Monday | 1 | 11 | 9 | 0 |
df['Base'].unique()
array(['B02512', 'B02598', 'B02617', 'B02682', 'B02764'], dtype=object)
df['day'].unique()
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
dtype=int64)
df['weekday'].unique()
array(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
'Sunday'], dtype=object)
# Notebook-only install step: this line is a shell-style command, not Python,
# and presumably relies on the notebook kernel treating a bare `pip` as a
# magic command -- it will not run in a plain .py script.
pip install plotly
Requirement already satisfied: plotly in c:\programdata\anaconda3\lib\site-packages (4.14.3) Requirement already satisfied: retrying>=1.3.3 in c:\programdata\anaconda3\lib\site-packages (from plotly) (1.3.3) Requirement already satisfied: six in c:\programdata\anaconda3\lib\site-packages (from plotly) (1.15.0) Note: you may need to restart the kernel to use updated packages.
import plotly.express as px
# Rides per weekday. Count once and reuse the result for both axes instead of
# running value_counts() over the full frame twice.
weekday_counts = df['weekday'].value_counts()
px.bar(x=weekday_counts.index,
       y=weekday_counts.values
       )
# Hourly ride distribution, one subplot per month.
months = df['month'].unique()  # computed once and shared by both loops
for month in months:
    print(month)

# One bin per hour of the day (edges 0, 1, ..., 24). The default of 10 bins
# merges two or three hours into each bar, which creates artificial peaks.
hour_bins = np.arange(25)
plt.figure(figsize=(60,30))
for i, month in enumerate(months):
    # NOTE: the 3x2 grid has room for six months only.
    plt.subplot(3, 2, i + 1)
    df[df['month']==month]['hour'].hist(bins=hour_bins)
9 5 6 7 8 4
## Analysis of Rush of each hour in each month
# One bin per hour (edges 0..24); the default 10 bins would lump two or three
# hours into each bar and distort the hourly profile.
hour_bins = np.arange(25)
for i in df['month'].unique():
    plt.figure(figsize=(5,3))
    plt.title("Month {}".format(i) )
    df[df['month']==i]['hour'].hist(bins=hour_bins)
# conda install -c plotly chart-studio
Collecting package metadata (current_repodata.json): ...working... done Solving environment: ...working... done # All requested packages already installed. Note: you may need to restart the kernel to use updated packages.
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
#Choose Max Month For Riding
# Rank months by their number of rides. The previous version summed the
# `hour` column, which adds up hour-of-day values (weighting late-evening
# rides more heavily) and therefore does not measure ride volume; counting
# rows per month does. The grouping is also computed once instead of twice.
rides_per_month = df.groupby('month')['hour'].count()
trace1 = go.Bar(
    x = rides_per_month.index,
    y = rides_per_month,
    name= 'Priority')
iplot([trace1])
# Analysis for Journey for Each Day
# Days run from 1 to 31, so use 31 unit-wide bins centred on each day. The
# previous range of (0.5, 30.5) with 30 bins left every ride made on the
# 31st outside the histogram range, so those rides were silently ignored.
plt.figure(figsize=(10,6))
plt.hist(df['day'], bins=31, rwidth=.8, range=(0.5, 31.5))
plt.xlabel('date of the month')
plt.ylabel('Total Journeys')
plt.title('Journeys by Month Day');
df['month'].unique()
array([9, 5, 6, 7, 8, 4], dtype=int64)
## Analysis of Total rides month-wise
plt.figure(figsize=(20,10))
for i,month in enumerate(df['month'].unique(),1):
    # NOTE: the 3x2 grid has room for six months only.
    plt.subplot(3,2,i)
    df_out=df[df['month']==month]
    # One bar per calendar day. The default 10 bins lump roughly three days
    # into each bar (four in the last bar of a 31-day month), so bars were
    # not comparable day by day.
    plt.hist(df_out['day'], bins=31, range=(0.5, 31.5))
    plt.xlabel('days in month {}'.format(month))
    plt.ylabel('total rides')
# Pickup latitude against hour of day, one series per weekday, using
# seaborn's default point estimate.
sns.set_style(style='whitegrid')
ax = sns.pointplot(
    data=df,
    x="hour",
    y="Lat",
    hue='weekday',
)
ax.set_title("hours of day Vs latitude of passenger ")
Text(0.5, 1.0, 'hours of day Vs latitude of passenger ')
df.groupby(['Base','month'])['Date/Time'].count()
Base month
B02512 4 35536
5 36765
6 32509
7 35021
8 31472
9 34370
B02598 4 183263
5 260549
6 242975
7 245597
8 220129
9 240600
B02617 4 108001
5 122734
6 184460
7 310160
8 355803
9 377695
B02682 4 227808
5 222883
6 194926
7 196754
8 173280
9 197138
B02764 4 9908
5 9504
6 8974
7 8589
8 48591
9 178333
Name: Date/Time, dtype: int64
# Ride counts per (Base, month) flattened into a regular table; the count
# ends up in the column named 'Date/Time'.
rides_by_base_month = df.groupby(['Base','month'])['Date/Time'].count()
base = rides_by_base_month.reset_index()
base
| Base | month | Date/Time | |
|---|---|---|---|
| 0 | B02512 | 4 | 35536 |
| 1 | B02512 | 5 | 36765 |
| 2 | B02512 | 6 | 32509 |
| 3 | B02512 | 7 | 35021 |
| 4 | B02512 | 8 | 31472 |
| 5 | B02512 | 9 | 34370 |
| 6 | B02598 | 4 | 183263 |
| 7 | B02598 | 5 | 260549 |
| 8 | B02598 | 6 | 242975 |
| 9 | B02598 | 7 | 245597 |
| 10 | B02598 | 8 | 220129 |
| 11 | B02598 | 9 | 240600 |
| 12 | B02617 | 4 | 108001 |
| 13 | B02617 | 5 | 122734 |
| 14 | B02617 | 6 | 184460 |
| 15 | B02617 | 7 | 310160 |
| 16 | B02617 | 8 | 355803 |
| 17 | B02617 | 9 | 377695 |
| 18 | B02682 | 4 | 227808 |
| 19 | B02682 | 5 | 222883 |
| 20 | B02682 | 6 | 194926 |
| 21 | B02682 | 7 | 196754 |
| 22 | B02682 | 8 | 173280 |
| 23 | B02682 | 9 | 197138 |
| 24 | B02764 | 4 | 9908 |
| 25 | B02764 | 5 | 9504 |
| 26 | B02764 | 6 | 8974 |
| 27 | B02764 | 7 | 8589 |
| 28 | B02764 | 8 | 48591 |
| 29 | B02764 | 9 | 178333 |
## Which base becomes more popular month by month
# `base` has one row per (Base, month); its 'Date/Time' column holds the ride
# count from the groupby that built it, which is why it is used as the y value.
plt.figure(figsize=(10,6))
sns.lineplot(x='month',y='Date/Time',hue='Base',data=base);
def count_rows(rows):
    """Return the length of *rows* as reported by ``len``."""
    n_rows = len(rows)
    return n_rows
# Rides per (weekday, hour). GroupBy.size() yields the same per-group row
# counts as applying count_rows to every group, without calling a Python
# function once per group.
by_cross = df.groupby(['weekday','hour']).size()
by_cross
weekday hour
Friday 0 13716
1 8163
2 5350
3 6930
4 8806
...
Wednesday 19 47017
20 47772
21 44553
22 32868
23 18146
Length: 168, dtype: int64
# Move the inner index level (hour) into columns: rows are weekdays, columns
# are hours, cells are ride counts.
pivot=by_cross.unstack()
pivot
| hour | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| weekday | |||||||||||||||||||||
| Friday | 13716 | 8163 | 5350 | 6930 | 8806 | 13450 | 23412 | 32061 | 31509 | 25230 | ... | 36206 | 43673 | 48169 | 51961 | 54762 | 49595 | 43542 | 48323 | 49409 | 41260 |
| Monday | 6436 | 3737 | 2938 | 6232 | 9640 | 15032 | 23746 | 31159 | 29265 | 22197 | ... | 28157 | 32744 | 38770 | 42023 | 37000 | 34159 | 32849 | 28925 | 20158 | 11811 |
| Saturday | 27633 | 19189 | 12710 | 9542 | 6846 | 7084 | 8579 | 11014 | 14411 | 17669 | ... | 31418 | 38769 | 43512 | 42844 | 45883 | 41098 | 38714 | 43826 | 47951 | 43174 |
| Sunday | 32877 | 23015 | 15436 | 10597 | 6374 | 6169 | 6596 | 8728 | 12128 | 16401 | ... | 28151 | 31112 | 33038 | 31521 | 28291 | 25948 | 25076 | 23967 | 19566 | 12166 |
| Thursday | 9293 | 5290 | 3719 | 5637 | 8505 | 14169 | 27065 | 37038 | 35431 | 27812 | ... | 36699 | 44442 | 50560 | 56704 | 55825 | 51907 | 51990 | 51953 | 44194 | 27764 |
| Tuesday | 6237 | 3509 | 2571 | 4494 | 7548 | 14241 | 26872 | 36599 | 33934 | 25023 | ... | 34846 | 41338 | 48667 | 55500 | 50186 | 44789 | 44661 | 39913 | 27712 | 14869 |
| Wednesday | 7644 | 4324 | 3141 | 4855 | 7511 | 13794 | 26943 | 36495 | 33826 | 25635 | ... | 35148 | 43388 | 50684 | 55637 | 52732 | 47017 | 47772 | 44553 | 32868 | 18146 |
7 rows × 24 columns
# Heatmap of the weekday-by-hour ride counts held in `pivot`; the trailing
# semicolon suppresses the notebook's textual echo of the return value.
plt.figure(figsize=(10,6))
sns.heatmap(pivot, annot=False);
def heatmap(col1, col2, data=None):
    """Draw a heatmap of row counts for every (col1, col2) combination.

    Args:
        col1: Column whose values become the heatmap rows.
        col2: Column whose values become the heatmap columns.
        data: Frame to summarise. Defaults to the module-level ``df`` so
            existing two-argument calls keep working.

    Returns:
        The object returned by ``sns.heatmap`` for the new figure.
    """
    if data is None:
        data = df
    # size() counts the rows of each group directly, replacing the slower
    # per-group ``apply(lambda x: len(x))``.
    counts = data.groupby([col1, col2]).size().unstack()
    plt.figure(figsize=(10,6))
    return sns.heatmap(counts, annot=False)


heatmap('day','hour');
heatmap('day','month');
## Analysis of Location data points
# Plot every pickup as a tiny red '+' and crop the view to a fixed
# longitude/latitude window.
lon_window = (-74.2, -73.7)
lat_window = (40.6, 41)
plt.figure(figsize=(10,6))
plt.plot(df['Lon'], df['Lat'], 'r+', ms=0.5)
plt.xlim(*lon_window)
plt.ylim(*lat_window);
# Keep only the rides whose weekday is Sunday.
is_sunday = df['weekday'] == 'Sunday'
df_out = df[is_sunday]
df_out.head()
| Date/Time | Lat | Lon | Base | weekday | day | minute | month | hour | |
|---|---|---|---|---|---|---|---|---|---|
| 8011 | 2014-09-07 00:00:00 | 40.7341 | -74.0005 | B02512 | Sunday | 7 | 0 | 9 | 0 |
| 8012 | 2014-09-07 00:00:00 | 40.7344 | -73.9900 | B02512 | Sunday | 7 | 0 | 9 | 0 |
| 8013 | 2014-09-07 00:00:00 | 40.7806 | -73.9582 | B02512 | Sunday | 7 | 0 | 9 | 0 |
| 8014 | 2014-09-07 00:01:00 | 40.7293 | -73.9859 | B02512 | Sunday | 7 | 1 | 9 | 0 |
| 8015 | 2014-09-07 00:01:00 | 40.7713 | -74.0133 | B02512 | Sunday | 7 | 1 | 9 | 0 |
df_out.groupby(['Lat','Lon'])['weekday'].count().reset_index()
| Lat | Lon | weekday | |
|---|---|---|---|
| 0 | 39.9374 | -74.0722 | 1 |
| 1 | 39.9378 | -74.0721 | 1 |
| 2 | 39.9384 | -74.0742 | 1 |
| 3 | 39.9385 | -74.0734 | 1 |
| 4 | 39.9415 | -74.0736 | 1 |
| ... | ... | ... | ... |
| 209225 | 41.3141 | -74.1249 | 1 |
| 209226 | 41.3180 | -74.1298 | 1 |
| 209227 | 41.3195 | -73.6905 | 1 |
| 209228 | 41.3197 | -73.6903 | 1 |
| 209229 | 42.1166 | -72.0666 | 1 |
209230 rows × 3 columns
import folium
from folium.plugins import HeatMap
# Heat map of the Sunday pickups: the rows passed to HeatMap are
# (Lat, Lon, count) triples, one per distinct coordinate pair.
basemap=folium.Map()
# NOTE(review): the count column presumably acts as the point weight, and
# `zoom` does not look like a documented HeatMap argument -- confirm both
# against the folium API reference.
HeatMap(df_out.groupby(['Lat','Lon'])['weekday'].count().reset_index(),zoom=20,radius=15).add_to(basemap)
basemap
##function for a specific day
def plot(df, day):
    """Build a folium heat map of the pickups recorded on one weekday.

    Args:
        df: Rides frame with 'weekday', 'Lat' and 'Lon' columns.
        day: Weekday name to keep, e.g. 'Sunday'.

    Returns:
        A new ``folium.Map`` carrying a single HeatMap layer for ``day``.
    """
    df_out = df[df['weekday'] == day]
    # Group once; the earlier version computed this table twice and threw
    # the first result away.
    counts = df_out.groupby(['Lat','Lon'])['weekday'].count().reset_index()
    # Draw on a fresh map. Adding to the shared module-level `basemap`
    # stacked another layer on it with every call, so the map returned for
    # one day also carried the layers of every earlier call.
    day_map = folium.Map()
    HeatMap(counts, zoom=20, radius=15).add_to(day_map)
    return day_map
plot(df,'Sunday')